"""
Phase 1: Data Assembly — Safe Asset Panel
==========================================
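Compiles sovereign credit ratings (hard-coded S&P history), a sovereign-spread
proxy standing in for EMBI spreads, and a convenience yield proxy; nothing is
downloaded at run time. Merges these with full_panel.csv and fiscal_panel.csv,
and defines the safe_issuer dummy and portfolio tilt measures.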

Output: safe_assets/data/processed/safe_asset_panel.csv
Tables: table1_safe_issuers.md, table1b_panel_summary.md
"""

import sys
from pathlib import Path

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# ── Paths ──────────────────────────────────────────────────────────────────
# Every location is derived from one hard-coded project root; the sibling
# projects (multilateral, fiscal_dominance) are resolved relative to its parent.
PROJECT_DIR = Path("/mnt/c/demographics_capital_flows/safe_assets")
# Sibling project that provides full_panel.csv (read in main()).
MULTILATERAL_DIR = PROJECT_DIR.parent / "multilateral"
# Sibling project that provides fiscal_panel.csv (read in main()).
FISCAL_DIR = PROJECT_DIR.parent / "fiscal_dominance"
# Created below but not otherwise referenced in this file.
RAW_DIR = PROJECT_DIR / "data" / "raw"
# safe_asset_panel.csv is written here.
PROCESSED_DIR = PROJECT_DIR / "data" / "processed"
# The markdown summary tables are written here.
TABLES_DIR = PROJECT_DIR / "output" / "tables"

# Import-time side effect: create the data/output directories if missing.
for d in [RAW_DIR, PROCESSED_DIR, TABLES_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Put the multilateral project's src/ first on the import path. No import from
# it is visible in this file.
sys.path.insert(0, str(MULTILATERAL_DIR / "src"))

# ── S&P Sovereign Ratings History ──────────────────────────────────────────
# Based on Kose-Kurlat-Ohnsorge-Sugawara (World Bank) compilations and
# historical S&P publications. AA- or above = safe issuer.

# Numeric encoding of the rating ladder: a larger value is a stronger credit,
# and the two default states ('SD', 'D') share the floor value 0. Entries are
# listed from strongest to weakest; code that maps a number back to a letter
# relies on this order ('SD' is found before 'D').
RATING_SCALE = {
    # AAA / AA band
    'AAA': 21,
    'AA+': 20,
    'AA': 19,
    'AA-': 18,
    # A band
    'A+': 17,
    'A': 16,
    'A-': 15,
    # BBB band
    'BBB+': 14,
    'BBB': 13,
    'BBB-': 12,
    # BB band
    'BB+': 11,
    'BB': 10,
    'BB-': 9,
    # B band
    'B+': 8,
    'B': 7,
    'B-': 6,
    # CCC band and below
    'CCC+': 5,
    'CCC': 4,
    'CCC-': 3,
    'CC': 2,
    'C': 1,
    # default states
    'SD': 0,
    'D': 0,
}

# A sovereign counts as a safe issuer when its rating is AA- or above.
SAFE_THRESHOLD = RATING_SCALE['AA-']

# Sovereign rating history for countries that held AA- or above at some point
# (CHL is also listed although it stays below the threshold throughout).
# Periods are approximate; compiled from S&P rating actions through end-2024.
# Format: iso3 -> list of (start_year, end_year, rating_numeric), where
# rating_numeric is a RATING_SCALE value and both years are inclusive when
# build_ratings_panel expands the periods. Consecutive periods share their
# boundary year; build_ratings_panel keeps the higher rating for that year.
RATING_HISTORY = {
    'USA': [(1990, 2011, 21), (2011, 2024, 20)],  # AAA → AA+ (Aug 2011)
    'DEU': [(1990, 2024, 21)],   # AAA throughout
    'GBR': [(1990, 2016, 21), (2016, 2024, 19)],  # AAA → AA (Jun 2016)
    'FRA': [(1990, 2012, 21), (2012, 2024, 19)],  # AAA → AA (Jan 2012)
    'JPN': [(1990, 2001, 21), (2001, 2002, 20), (2002, 2007, 19),
            (2007, 2011, 19), (2011, 2015, 18), (2015, 2024, 17)],  # gradual decline
    'CAN': [(1990, 1994, 20), (1994, 2002, 20), (2002, 2024, 21)],
    'AUS': [(1990, 2003, 19), (2003, 2024, 21)],
    'CHE': [(1990, 2024, 21)],
    'NLD': [(1990, 2024, 21)],
    'AUT': [(1990, 2012, 21), (2012, 2024, 20)],  # AAA → AA+ (Jan 2012)
    'DNK': [(1990, 2024, 21)],
    'FIN': [(1990, 2014, 21), (2014, 2024, 20)],  # AAA → AA+ (Oct 2014)
    'NOR': [(1990, 2024, 21)],
    'SWE': [(1990, 2024, 21)],
    'SGP': [(1995, 2024, 21)],
    'HKG': [(1990, 2024, 20)],  # AA+ throughout (S&P)
    'LUX': [(1994, 2024, 21)],
    'NZL': [(1990, 2002, 20), (2002, 2011, 21), (2011, 2024, 20)],
    'BEL': [(1990, 1998, 20), (1998, 2011, 20), (2011, 2024, 19)],  # AA+ → AA in 2011 (still above AA-)
    'IRL': [(1990, 2001, 20), (2001, 2009, 21), (2009, 2011, 14),
            (2011, 2013, 11), (2013, 2018, 17), (2018, 2024, 19)],
    'ESP': [(1990, 2003, 20), (2003, 2010, 21), (2010, 2012, 19),
            (2012, 2014, 11), (2014, 2019, 14), (2019, 2024, 17)],
    'ITA': [(1990, 2003, 19), (2003, 2004, 19), (2004, 2006, 17),
            (2006, 2011, 17), (2011, 2024, 13)],
    'KOR': [(1990, 1997, 18), (1997, 1998, 9), (1998, 1999, 12),
            (1999, 2002, 14), (2002, 2005, 17), (2005, 2012, 17),
            (2012, 2016, 17), (2016, 2024, 19)],
    'TWN': [(1990, 2024, 20)],  # AA+ stable
    'CZE': [(1990, 2024, 19)],  # approximately AA range
    'ISR': [(1990, 2007, 18), (2007, 2014, 17), (2014, 2024, 20)],
    'KWT': [(1990, 2024, 19)],
    'QAT': [(2003, 2017, 19), (2017, 2024, 19)],
    'ARE': [(2007, 2024, 19)],
    'SAU': [(2010, 2016, 20), (2016, 2024, 17)],
    'CHL': [(1995, 2024, 17)],  # A+ range, below safe threshold
}

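# Major economies with consistently sub-AA- ratings are not listed in
# RATING_HISTORY (e.g. BRA, MEX, IND, CHN, RUS, TUR, ZAF, IDN, ARG). After the
# merge in main() they get safe_issuer = 0 and a missing rating_numeric.
# CHL is the one listed country that never reaches AA-.
# NOTE(review): confirm this list against S&P history (CHN in particular).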


def build_ratings_panel(years):
    """Expand RATING_HISTORY into an annual country-year ratings panel.

    Args:
        years: Iterable of calendar years. Only its minimum and maximum are
            used: every year of that inclusive span that falls inside a
            rating period is emitted, even if ``years`` has gaps or is
            unsorted.

    Returns:
        DataFrame with columns ``iso3``, ``year``, ``rating_numeric`` and
        ``safe_issuer`` (1 when ``rating_numeric >= SAFE_THRESHOLD``, else 0),
        one row per country-year, sorted by ``iso3`` then ``year``. An empty
        frame with these four columns is returned when ``years`` is empty or
        no rating period overlaps the requested span.
    """
    columns = ['iso3', 'year', 'rating_numeric', 'safe_issuer']

    years = list(years)
    if not years:
        # Nothing to expand; avoid indexing into an empty sequence.
        return pd.DataFrame(columns=columns)
    first_year, last_year = min(years), max(years)

    records = []
    for iso3, periods in RATING_HISTORY.items():
        for start, end, rating_num in periods:
            # Period end years are inclusive; clip each period to the span.
            for yr in range(max(start, first_year), min(end, last_year) + 1):
                records.append({
                    'iso3': iso3,
                    'year': yr,
                    'rating_numeric': rating_num,
                    'safe_issuer': int(rating_num >= SAFE_THRESHOLD),
                })

    # Passing the column list keeps sort_values working on an empty result.
    df = pd.DataFrame(records, columns=columns)

    # Adjacent periods share their boundary year. Sorting ascending by rating
    # and keeping the last row means the HIGHER rating wins for that year.
    # NOTE(review): this records a downgrade one year late (e.g. a 2011
    # downgrade first shows up in 2012) while an upgrade shows up in its own
    # year — confirm this is the intended convention.
    df = df.sort_values(['iso3', 'year', 'rating_numeric']).drop_duplicates(
        subset=['iso3', 'year'], keep='last')
    return df


# ── Convenience Yield Proxy (US only) ─────────────────────────────────────
# Moody's Aaa corporate yield minus 10y Treasury (FRED: AAA - GS10)
# We approximate from the spread between corp and govt yields.
# For cross-country: corporate - sovereign spread where available.

def build_convenience_yield_us():
    """Placeholder for a US convenience yield proxy; always returns None.

    No series is built here. The intended measure is an Aaa corporate minus
    Treasury spread (historically averaging roughly 80-120bp, compressing
    during aging / safe asset scarcity periods). The proxy actually used is
    constructed in the merge step from panel columns: main() derives
    ``lending_govt_spread`` from the lending rate and the 10y government
    bond yield.
    """
    # Nothing to compute: the spread is derived from available panel variables.
    return None


# ── EMBI Spread Proxy ─────────────────────────────────────────────────────
# JP Morgan EMBI spread not directly in our panel, but we can proxy using:
# lending_rate - policy_rate (domestic spread) or
# real_bond_10y_diff (country vs world 10y)
# The real_bond_10y_diff already in full_panel captures the sovereign spread

def _add_derived_variables(df):
    """Add portfolio-tilt, spread-proxy and interaction columns to *df* in place; return *df*."""
    # Portfolio tilt: debt share of gross assets (zero denominators -> NaN)
    if 'debt_assets_gdp' in df.columns and 'gross_assets_gdp' in df.columns:
        df['debt_share'] = df['debt_assets_gdp'] / df['gross_assets_gdp'].replace(0, np.nan)
        df['debt_share'] = df['debt_share'].clip(0, 1)
        print(f"  debt_share: {df['debt_share'].notna().sum():,} non-null, "
              f"mean={df['debt_share'].mean():.3f}")

    # Equity share of gross assets
    if 'port_eq_assets_gdp' in df.columns and 'gross_assets_gdp' in df.columns:
        df['equity_share'] = df['port_eq_assets_gdp'] / df['gross_assets_gdp'].replace(0, np.nan)
        df['equity_share'] = df['equity_share'].clip(0, 1)
        print(f"  equity_share: {df['equity_share'].notna().sum():,} non-null, "
              f"mean={df['equity_share'].mean():.3f}")

    # Debt-to-equity ratio (portfolio tilt intensity)
    if 'debt_assets_gdp' in df.columns and 'port_eq_assets_gdp' in df.columns:
        df['debt_equity_ratio'] = (df['debt_assets_gdp'] /
                                   df['port_eq_assets_gdp'].replace(0, np.nan))
        # Cap extreme ratios produced by near-zero equity holdings.
        df['debt_equity_ratio'] = df['debt_equity_ratio'].clip(0, 50)

    # Sovereign spread proxy: country real 10y minus world average
    # (also serves as the EMBI proxy for non-OECD countries)
    if 'real_bond_10y_diff' in df.columns:
        df['sovereign_spread'] = df['real_bond_10y_diff']
        print(f"  sovereign_spread (10y diff): {df['sovereign_spread'].notna().sum():,} non-null")

    # Convenience yield proxy: lending_rate minus govt_bond_10y
    # (corporate-ish minus sovereign = inverse convenience yield)
    if 'lending_rate' in df.columns and 'govt_bond_10y' in df.columns:
        df['lending_govt_spread'] = df['lending_rate'] - df['govt_bond_10y']
        print(f"  lending_govt_spread: {df['lending_govt_spread'].notna().sum():,} non-null")

    # Additional spread: lending_rate - policy_rate (domestic risk premium)
    if 'lending_rate' in df.columns and 'policy_rate' in df.columns:
        df['domestic_spread'] = df['lending_rate'] - df['policy_rate']
        print(f"  domestic_spread: {df['domestic_spread'].notna().sum():,} non-null")

    # Safe issuer × debt interaction
    if 'govt_debt_gdp' in df.columns:
        df['safe_x_debt'] = df['safe_issuer'] * df['govt_debt_gdp']

    # Demographic interactions with safe status.
    # NOTE(review): unlike the optional columns above, Z_1..Z_3 are read
    # without a guard, so a missing column raises KeyError — presumably they
    # are required inputs; confirm.
    for z in ['Z_1', 'Z_2', 'Z_3']:
        df[f'{z}_x_safe'] = df[z] * df['safe_issuer']

    return df


def _add_global_aggregates(df, ratings):
    """Merge year-level safe-supply, GDP-weighted OADR and safe-issuer counts onto *df*; return the result."""
    # Global safe supply: USD government debt of safe issuers over the summed
    # USD GDP of all panel countries.
    if 'govt_debt_gdp' in df.columns and 'ngdp_usd' in df.columns:
        safe_debt = df[df['safe_issuer'] == 1].copy()
        # govt_debt_gdp is divided by 100, i.e. treated as percent of GDP.
        safe_debt['debt_usd'] = safe_debt['govt_debt_gdp'] / 100 * safe_debt['ngdp_usd']
        # min_count=1: a year in which no safe issuer has usable data stays
        # NaN instead of being reported as zero supply. Years with partial
        # coverage are still summed over the available countries only.
        global_safe_supply = (safe_debt.groupby('year')['debt_usd']
                              .sum(min_count=1).reset_index())
        global_safe_supply.columns = ['year', 'global_safe_debt_usd']

        global_gdp = df.groupby('year')['ngdp_usd'].sum(min_count=1).reset_index()
        global_gdp.columns = ['year', 'global_gdp_usd']

        global_agg = global_safe_supply.merge(global_gdp, on='year')
        global_agg['safe_supply_ratio'] = (global_agg['global_safe_debt_usd'] /
                                           global_agg['global_gdp_usd'])

        # Global OADR (GDP-weighted)
        if 'old_dep' in df.columns:
            weighted = df.dropna(subset=['old_dep', 'ngdp_usd']).copy()
            weighted['w_oadr'] = weighted['old_dep'] * weighted['ngdp_usd']
            global_oadr = weighted.groupby('year').agg(
                w_oadr_sum=('w_oadr', 'sum'),
                gdp_sum=('ngdp_usd', 'sum')
            ).reset_index()
            global_oadr['global_oadr'] = global_oadr['w_oadr_sum'] / global_oadr['gdp_sum']
            # Left join: a year without OADR data keeps its supply ratio
            # (an inner join would drop the whole year).
            global_agg = global_agg.merge(global_oadr[['year', 'global_oadr']],
                                          on='year', how='left')
            global_agg['demand_pressure'] = global_agg['global_oadr']
            global_agg['safe_gap_ratio'] = (global_agg['safe_supply_ratio'] /
                                            global_agg['demand_pressure'].replace(0, np.nan))

        # 'year' is already the first column of global_agg.
        df = df.merge(global_agg, on='year', how='left')
        print(f"  Global safe supply ratio: {global_agg['safe_supply_ratio'].mean():.4f} "
              f"(mean, {global_agg['year'].min()}-{global_agg['year'].max()})")

    # Number of safe issuers per year, counted from the ratings panel (so it
    # includes rated countries even if they are absent from the main panel).
    safe_count = ratings[ratings['safe_issuer'] == 1].groupby('year')['iso3'].nunique().reset_index()
    safe_count.columns = ['year', 'n_safe_issuers']
    df = df.merge(safe_count, on='year', how='left')
    # Years outside the ratings panel have no count; they are set to 0.
    df['n_safe_issuers'] = df['n_safe_issuers'].fillna(0).astype(int)
    return df


def main():
    """Assemble the safe-asset panel, save it, and write the summary tables.

    Steps: load full_panel.csv (years up to 2024) and fiscal_panel.csv, build
    the 1990-2024 ratings panel, left-merge ratings and any fiscal columns the
    panel lacks, add derived variables and year-level global aggregates, write
    safe_asset_panel.csv to PROCESSED_DIR, then write the two markdown tables.

    Returns:
        The merged country-year DataFrame that was written to disk.

    Raises:
        FileNotFoundError: if either input CSV is missing.
        KeyError: if required columns (e.g. iso3, year, Z_1..Z_3) are absent.
    """
    print("=" * 70)
    print("PHASE 1: Data Assembly — Safe Asset Panel")
    print("=" * 70)

    # ── [1] Load full panel ──
    print("\n[1] Loading full_panel.csv ...")
    fp = pd.read_csv(MULTILATERAL_DIR / "followup" / "data" / "processed" / "full_panel.csv")
    fp = fp[fp['year'] <= 2024].copy()
    print(f"  Full panel: {fp['iso3'].nunique()} countries, {len(fp):,} obs, "
          f"{fp['year'].min()}-{fp['year'].max()}")

    # ── [2] Load fiscal panel for govt debt ──
    print("\n[2] Loading fiscal_panel.csv ...")
    fisc = pd.read_csv(FISCAL_DIR / "data" / "processed" / "fiscal_panel.csv")
    fiscal_cols = ['iso3', 'year', 'govt_debt_gdp', 'govt_net_debt_gdp',
                   'primary_bal_gdp', 'structural_bal_gdp',
                   'govt_revenue_gdp', 'govt_expenditure_gdp']
    # Keep only the columns this fiscal panel actually provides.
    fiscal_cols = [c for c in fiscal_cols if c in fisc.columns]
    # One row per country-year so the left merge below cannot duplicate rows.
    fisc_merge = fisc[fiscal_cols].drop_duplicates(subset=['iso3', 'year'])
    print(f"  Fiscal panel: {fisc['iso3'].nunique()} countries, {len(fisc):,} obs")

    # ── [3] Build ratings panel ──
    print("\n[3] Building sovereign ratings panel ...")
    years = list(range(1990, 2025))
    ratings = build_ratings_panel(years)
    n_safe = ratings[ratings['safe_issuer'] == 1]['iso3'].nunique()
    print(f"  Ratings: {ratings['iso3'].nunique()} countries, {len(ratings):,} obs")
    print(f"  Safe issuers (AA- or above at any point): {n_safe}")

    # ── [4] Merge everything ──
    print("\n[4] Merging panels ...")
    df = fp.merge(ratings[['iso3', 'year', 'rating_numeric', 'safe_issuer']],
                  on=['iso3', 'year'], how='left')
    # Country-years without rating data are assumed non-safe; rating_numeric
    # is left as NaN for them.
    df['safe_issuer'] = df['safe_issuer'].fillna(0).astype(int)

    # Merge only the fiscal columns the panel does not already carry, so no
    # _x/_y suffixed duplicates are created.
    fisc_new_cols = [c for c in fiscal_cols if c not in df.columns or c in ['iso3', 'year']]
    if len(fisc_new_cols) > 2:  # more than just iso3, year
        df = df.merge(fisc_merge[fisc_new_cols], on=['iso3', 'year'], how='left')

    # ── [5] Construct derived variables ──
    print("\n[5] Constructing derived variables ...")
    df = _add_derived_variables(df)

    # ── [6] Global aggregates ──
    print("\n[6] Constructing global aggregates ...")
    df = _add_global_aggregates(df, ratings)

    # ── [7] Save panel ──
    print("\n[7] Saving panel ...")
    output_path = PROCESSED_DIR / "safe_asset_panel.csv"
    df.to_csv(output_path, index=False)
    print(f"  Saved: {output_path}")
    print(f"  Shape: {df.shape[0]:,} obs × {df.shape[1]} columns")
    print(f"  Countries: {df['iso3'].nunique()}")
    print(f"  Years: {df['year'].min()}-{df['year'].max()}")
    print(f"  Safe issuers (any year): {df[df['safe_issuer']==1]['iso3'].nunique()}")

    # ── [8] Summary tables ──
    print("\n[8] Building summary tables ...")
    build_safe_issuer_table(df, ratings)
    build_panel_summary(df)

    print("\n" + "=" * 70)
    print("Phase 1 complete.")
    print("=" * 70)

    return df


def build_safe_issuer_table(df, ratings):
    """Write Table 1 (table1_safe_issuers.md): safe issuers and their safe years.

    Args:
        df: Merged panel; accepted for call-site symmetry but not read here.
        ratings: Country-year ratings panel with columns ``iso3``, ``year``,
            ``rating_numeric`` and ``safe_issuer``.

    One row is written per RATING_HISTORY country that is a safe issuer in at
    least one year. The row shows the first and last safe year (gaps between
    them are not shown), whether the country is safe in its latest rated
    year, and that year's rating letter. The Country column repeats the ISO3
    code.
    """
    lines = [
        "# Table 1: Safe Sovereign Issuers (AA- or Above)\n",
        "| Country | ISO3 | First Safe | Last Safe | Current (2024) | Rating (latest) |",
        "|---------|------|-----------|-----------|----------------|-----------------|",
    ]

    # Numeric value -> rating letter. setdefault keeps the first letter listed
    # for a shared value (0 resolves to 'SD'), like a first-match scan.
    letter_by_value = {}
    for letter, value in RATING_SCALE.items():
        letter_by_value.setdefault(value, letter)

    for code in sorted(RATING_HISTORY):
        country_rows = ratings[ratings['iso3'] == code]
        safe_years = country_rows.loc[country_rows['safe_issuer'] == 1, 'year']
        if safe_years.empty:
            # Never a safe issuer in the panel: leave it out of the table.
            continue

        newest_year = country_rows['year'].max()
        latest = country_rows[country_rows['year'] == newest_year].iloc[0]
        rating_letter = letter_by_value.get(latest['rating_numeric'], 'N/A')
        if latest['safe_issuer'] == 1:
            current = 'Yes'
        else:
            current = 'No'

        lines.append(f"| {code} | {code} | {safe_years.min()} | {safe_years.max()} "
                     f"| {current} | {rating_letter} |")

    # The footer counts issuers that are safe specifically in 2024.
    safe_in_2024 = (ratings['year'] == 2024) & (ratings['safe_issuer'] == 1)
    n_current = ratings[safe_in_2024]['iso3'].nunique()
    lines.append(f"\n*{n_current} countries rated AA- or above as of 2024.*")
    lines.append("*Source: S&P rating actions compiled from Kose et al. (World Bank) and public disclosures.*")

    out = TABLES_DIR / "table1_safe_issuers.md"
    out.write_text('\n'.join(lines))
    print(f"  Saved: {out}")


def build_panel_summary(df):
    """Write Table 1b (table1b_panel_summary.md): panel summary statistics.

    Args:
        df: Merged panel. Must contain ``iso3``, ``year`` and ``safe_issuer``;
            every other variable is reported only if its column exists.

    The first section lists N, mean, SD, min and max over non-missing values
    for each available key variable. The second compares safe and non-safe
    issuer means, skipping a variable when either group mean is NaN.
    """
    summary_vars = ['real_bond_10y', 'real_short_3m', 'term_spread',
                    'sovereign_spread', 'lending_govt_spread', 'domestic_spread',
                    'debt_share', 'equity_share', 'debt_equity_ratio',
                    'govt_debt_gdp', 'safe_issuer', 'rating_numeric',
                    'Z_1', 'Z_2', 'Z_3', 'old_dep', 'youth_dep',
                    'kaopen', 'nfa_gdp_lag', 'rgdp_growth', 'inflation',
                    'safe_supply_ratio', 'global_oadr', 'n_safe_issuers']
    compare_vars = ['real_bond_10y', 'old_dep', 'govt_debt_gdp', 'kaopen',
                    'nfa_gdp_lag', 'rgdp_growth', 'gdp_pc_ppp']

    report = [
        "# Table 1b: Safe Asset Panel Summary Statistics\n",
        "| Variable | N | Mean | SD | Min | Max |",
        "|----------|---|------|-----|-----|-----|",
    ]

    # Descriptive statistics over the non-missing values of each variable.
    for name in summary_vars:
        if name not in df.columns:
            continue
        values = df[name].dropna()
        if values.empty:
            continue
        report.append(f"| {name} | {len(values):,} | {values.mean():.3f} | {values.std():.3f} "
                      f"| {values.min():.3f} | {values.max():.3f} |")

    report.append(f"\n*Panel: {df['iso3'].nunique()} countries, {df['year'].min()}-{df['year'].max()}.*")

    # Safe vs non-safe comparison of group means.
    report.append("\n## Safe vs Non-Safe Issuer Comparison\n")
    report.append("| Variable | Safe Mean | Non-Safe Mean | Diff |")
    report.append("|----------|-----------|---------------|------|")

    safe_rows = df[df['safe_issuer'] == 1]
    other_rows = df[df['safe_issuer'] == 0]

    for name in compare_vars:
        if name not in df.columns:
            continue
        safe_mean = safe_rows[name].mean()
        other_mean = other_rows[name].mean()
        # A NaN mean means that group has no data for this variable.
        if np.isnan(safe_mean) or np.isnan(other_mean):
            continue
        report.append(f"| {name} | {safe_mean:.3f} | {other_mean:.3f} | {safe_mean - other_mean:.3f} |")

    out = TABLES_DIR / "table1b_panel_summary.md"
    out.write_text('\n'.join(report))
    print(f"  Saved: {out}")


# Run the Phase 1 pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
